Description

This script analyzes UFC fight odds data.


Libraries

library(tidyverse)


Examine Data

Load data.

load("./Datasets/df_master.RData")

Get summary.

summary(df_master)
##      NAME               Date              Event               City          
##  Length:5916        Length:5916        Length:5916        Length:5916       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     State             Country          FightWeightClass       Round      
##  Length:5916        Length:5916        Length:5916        Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :3.000  
##                                                           Mean   :2.428  
##                                                           3rd Qu.:3.000  
##                                                           Max.   :5.000  
##                                                                          
##     Method          Winner_Odds         Loser_Odds            Sex           
##  Length:5916        Length:5916        Length:5916        Length:5916       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     fight_id       Result          FighterWeight   FighterWeightClass
##  Min.   :   1   Length:5916        Min.   :115.0   Length:5916       
##  1st Qu.: 740   Class :character   1st Qu.:135.0   Class :character  
##  Median :1480   Mode  :character   Median :155.0   Mode  :character  
##  Mean   :1480                      Mean   :163.9                     
##  3rd Qu.:2219                      3rd Qu.:185.0                     
##  Max.   :2958                      Max.   :265.0                     
##                                                                      
##      REACH            SLPM             SAPM             STRA       
##  Min.   :58.00   Min.   : 0.000   Min.   : 0.100   Min.   :0.0000  
##  1st Qu.:69.00   1st Qu.: 2.680   1st Qu.: 2.630   1st Qu.:0.3900  
##  Median :72.00   Median : 3.440   Median : 3.220   Median :0.4400  
##  Mean   :71.77   Mean   : 3.527   Mean   : 3.429   Mean   :0.4415  
##  3rd Qu.:75.00   3rd Qu.: 4.250   3rd Qu.: 4.030   3rd Qu.:0.4900  
##  Max.   :84.00   Max.   :11.140   Max.   :23.330   Max.   :0.8000  
##  NA's   :211                                                       
##       STRD              TD              TDA              TDD        
##  Min.   :0.0900   Min.   : 0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5100   1st Qu.: 0.560   1st Qu.:0.2700   1st Qu.:0.5100  
##  Median :0.5600   Median : 1.210   Median :0.3700   Median :0.6400  
##  Mean   :0.5527   Mean   : 1.519   Mean   :0.3746   Mean   :0.6162  
##  3rd Qu.:0.6000   3rd Qu.: 2.160   3rd Qu.:0.5000   3rd Qu.:0.7600  
##  Max.   :0.9200   Max.   :14.190   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##       SUBA        
##  Min.   : 0.0000  
##  1st Qu.: 0.1000  
##  Median : 0.4000  
##  Mean   : 0.5553  
##  3rd Qu.: 0.8000  
##  Max.   :12.1000  
## 

Redefine variables.

# Convert the raw character / integer columns to their analysis types in one
# pass: identifiers and categories become factors, Date becomes a Date, and
# the odds columns become numeric.
df_master <- df_master %>%
  dplyr::mutate(
    NAME = as.factor(NAME),
    Date = as.Date(Date),
    Event = as.factor(Event),
    City = as.factor(City),
    State = as.factor(State),
    Country = as.factor(Country),
    FightWeightClass = as.factor(FightWeightClass),
    Method = as.factor(Method),
    Winner_Odds = as.numeric(Winner_Odds),
    Loser_Odds = as.numeric(Loser_Odds),
    fight_id = as.factor(fight_id),
    Sex = as.factor(Sex),
    Result = as.factor(Result),
    FighterWeightClass = as.factor(FighterWeightClass)
  )

Summarize again… There are infinite odds and overturned / DQ fight outcomes. These will have to be removed.

summary(df_master)
##                  NAME           Date           
##  Donald Cerrone    :  24   Min.   :2013-04-27  
##  Ovince Saint Preux:  21   1st Qu.:2015-08-08  
##  Jim Miller        :  19   Median :2017-04-22  
##  Derrick Lewis     :  18   Mean   :2017-06-03  
##  Neil Magny        :  18   3rd Qu.:2019-03-30  
##  Tim Means         :  18   Max.   :2020-12-19  
##  (Other)           :5798                       
##                                   Event                  City     
##  UFC Fight Night: Poirier vs. Gaethje:  28   Las Vegas     :1222  
##  UFC Fight Night: Whittaker vs. Till :  28   Abu Dhabi     : 210  
##  UFC 190: Rousey vs Correia          :  26   Boston        : 124  
##  UFC 193: Rousey vs Holm             :  26   Rio de Janeiro: 124  
##  UFC 210: Cormier vs. Johnson 2      :  26   Chicago       : 120  
##  UFC 224: Nunes vs. Pennington       :  26   Newark        : 114  
##  (Other)                             :5756   (Other)       :4002  
##         State                      Country          FightWeightClass
##  Nevada    :1222   USA                 :3442   Lightweight  : 978   
##  Texas     : 256   Brazil              : 532   Welterweight : 978   
##  New York  : 252   Canada              : 378   Bantamweight : 840   
##  California: 250   Australia           : 236   Featherweight: 712   
##  Abu Dhabi : 210   United Arab Emirates: 210   Middleweight : 646   
##  Florida   : 176   United Kingdom      : 184   Flyweight    : 484   
##  (Other)   :3550   (Other)             : 934   (Other)      :1278   
##      Round              Method      Winner_Odds     Loser_Odds       Sex      
##  Min.   :1.000   DQ        :  14   Min.   :1.06   Min.   :1.07   Female: 754  
##  1st Qu.:1.000   KO/TKO    :1890   1st Qu.:1.42   1st Qu.:1.77   Male  :5162  
##  Median :3.000   M-DEC     :  34   Median :1.71   Median :2.38                
##  Mean   :2.428   Overturned:  20   Mean   : Inf   Mean   : Inf                
##  3rd Qu.:3.000   S-DEC     : 620   3rd Qu.:2.33   3rd Qu.:3.36                
##  Max.   :5.000   SUB       :1052   Max.   : Inf   Max.   : Inf                
##                  U-DEC     :2286                                              
##     fight_id       Result     FighterWeight       FighterWeightClass
##  1      :   2   Loser :2958   Min.   :115.0   Welterweight : 999    
##  2      :   2   Winner:2958   1st Qu.:135.0   Lightweight  : 984    
##  3      :   2                 Median :155.0   Bantamweight : 783    
##  4      :   2                 Mean   :163.9   Featherweight: 711    
##  5      :   2                 3rd Qu.:185.0   Middleweight : 651    
##  6      :   2                 Max.   :265.0   Flyweight    : 541    
##  (Other):5904                                 (Other)      :1247    
##      REACH            SLPM             SAPM             STRA       
##  Min.   :58.00   Min.   : 0.000   Min.   : 0.100   Min.   :0.0000  
##  1st Qu.:69.00   1st Qu.: 2.680   1st Qu.: 2.630   1st Qu.:0.3900  
##  Median :72.00   Median : 3.440   Median : 3.220   Median :0.4400  
##  Mean   :71.77   Mean   : 3.527   Mean   : 3.429   Mean   :0.4415  
##  3rd Qu.:75.00   3rd Qu.: 4.250   3rd Qu.: 4.030   3rd Qu.:0.4900  
##  Max.   :84.00   Max.   :11.140   Max.   :23.330   Max.   :0.8000  
##  NA's   :211                                                       
##       STRD              TD              TDA              TDD        
##  Min.   :0.0900   Min.   : 0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5100   1st Qu.: 0.560   1st Qu.:0.2700   1st Qu.:0.5100  
##  Median :0.5600   Median : 1.210   Median :0.3700   Median :0.6400  
##  Mean   :0.5527   Mean   : 1.519   Mean   :0.3746   Mean   :0.6162  
##  3rd Qu.:0.6000   3rd Qu.: 2.160   3rd Qu.:0.5000   3rd Qu.:0.7600  
##  Max.   :0.9200   Max.   :14.190   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##       SUBA        
##  Min.   : 0.0000  
##  1st Qu.: 0.1000  
##  Median : 0.4000  
##  Mean   : 0.5553  
##  3rd Qu.: 0.8000  
##  Max.   :12.1000  
## 

How many events does the dataset include?

length(unique(df_master$Event))
## [1] 258

How many fights?

length(unique(df_master$fight_id))
## [1] 2958

Over what time frame?

range(sort(unique(df_master$Date)))
## [1] "2013-04-27" "2020-12-19"


Analyse Odds

Make copy for analysis.

df_odds = df_master
rm(df_master)

Filter out controversial results and infinite odds.

# Keep only fights with a regular outcome and finite odds on both sides.
# Conditions passed as separate arguments to filter() are combined with AND.
df_odds <- df_odds %>%
  dplyr::filter(
    Method != "DQ",
    Method != "Overturned",
    is.finite(Winner_Odds),
    is.finite(Loser_Odds)
  )

Get rid of fighter-specifics so that we can spread the data frame. This will give us one fight per row.

# Drop the per-fighter columns, then turn the Result / NAME pair into one
# "Loser" and one "Winner" column holding the fighter names.
df_odds_short <- df_odds %>%
  dplyr::select(-(FighterWeight:SUBA)) %>%
  spread(key = Result, value = NAME)

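How often were the (best) odds equal? Note that `df_odds` still has one row per fighter (two rows per fight), so the count below is in fighter-rows.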

mean(df_odds$Winner_Odds == df_odds$Loser_Odds)
## [1] 0.005475702
sum(df_odds$Winner_Odds == df_odds$Loser_Odds)
## [1] 32

Filter out equal odds and identify if Favorite won the fight.

# Drop pick'em fights (equal odds), then flag whether the favorite (the side
# with the lower decimal odds) won and compute the profit of a one-unit bet
# on each side: the winning side pays Winner_Odds - 1, the losing side -1.
df_odds_short <- df_odds_short %>%
  dplyr::filter(Winner_Odds != Loser_Odds) %>%
  dplyr::mutate(
    Favorite_was_Winner = Winner_Odds < Loser_Odds,
    Favorite_Unit_Profit = ifelse(Favorite_was_Winner, Winner_Odds - 1, -1),
    Underdog_Unit_Profit = ifelse(Favorite_was_Winner, -1, Winner_Odds - 1)
  )

What was the mean unit profit (i.e. ROI) if one bet solely on the Favorite?

mean(df_odds_short$Favorite_Unit_Profit)
## [1] -0.0226841

What was the mean unit profit if one bet solely on the Underdog?

mean(df_odds_short$Underdog_Unit_Profit)
## [1] -0.002481074

What proportion of the time does the Favorite win?

mean(df_odds_short$Favorite_was_Winner)
## [1] 0.6462491

Calculate implied probability of each fight based on odds.

# Implied probability is the reciprocal of the decimal odds. The favorite's
# odds are Winner_Odds when the favorite won and Loser_Odds otherwise; the
# underdog's odds are the other column.
df_odds_short <- df_odds_short %>%
  dplyr::mutate(
    Favorite_Probability =
      1 / ifelse(Favorite_was_Winner, Winner_Odds, Loser_Odds),
    Underdog_Probability =
      1 / ifelse(Favorite_was_Winner, Loser_Odds, Winner_Odds)
  )

Calculate overround for each fight.

NOTE: these odds are the best available odds for each fight / fighter. Therefore, this is not overround in the traditional sense (looking at one particular odds maker).

# Sum of the two implied probabilities, and its excess over 1 (the overround).
df_odds_short$Total_Probability <-
  df_odds_short$Favorite_Probability + df_odds_short$Underdog_Probability
df_odds_short$Overround <- df_odds_short$Total_Probability - 1

There is very little overround. This is because we are picking the best odds for each fight / fighter. By picking the best odds, we are counteracting the built-in overround of any particular odds-maker (typically around 5% as a rough estimate).

mean(df_odds_short$Overround)
## [1] 0.004323413
mean(df_odds_short$Total_Probability)
## [1] 1.004323


Odds performance

Add year as variable.

# Four-digit year of the fight date, stored as a character string.
df_odds_short$Year <- format(df_odds_short$Date, "%Y")

Create function to graphically assess over performance as a function of several variables. These are not inferential analyses but are instead meant to visualize the data to observe trends for further analysis.

#' Midpoints of `cut()` interval labels.
#'
#' @param bin_labels Character vector of labels of the form "(lower,upper]".
#' @return Numeric vector holding (lower + upper) / 2 for each label.
gop_bin_midpoints <- function(bin_labels) {
  lower <- as.numeric(sub("\\((.+),.*", "\\1", bin_labels))
  upper <- as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", bin_labels))
  (lower + upper) / 2
}

#' Bin one side of the market (favorites or underdogs) and summarise each bin.
#'
#' @param probability Numeric vector of implied probabilities for this side.
#' @param won Logical vector, TRUE where this side won the fight.
#' @param unit_profit Numeric vector of one-unit bet profits for this side.
#' @param num_bin Number of equal-width bins passed to `cut()`.
#' @param dummy Optional vector of a second grouping variable (same length as
#'   `probability`); `NULL` groups by probability bin only.
#' @return A tibble with Probability_Bin, (Dummy,) Prop_of_Victory,
#'   Size_of_Bin, ROI, Mid_Bin and Over_Performance.
gop_summarise_side <- function(probability, won, unit_profit, num_bin,
                               dummy = NULL) {
  side <- data.frame(
    Probability_Bin = cut(probability, num_bin),
    Won = won,
    Unit_Profit = unit_profit
  )

  if (is.null(dummy)) {
    side <- dplyr::group_by(side, Probability_Bin)
  } else {
    # added with `$<-` so that a character vector stays character
    side$Dummy <- dummy
    side <- dplyr::group_by(side, Probability_Bin, Dummy)
  }

  perf <- dplyr::summarise(
    side,
    Prop_of_Victory = mean(Won),
    Size_of_Bin = length(Won),
    ROI = mean(Unit_Profit)
  )

  # the bin midpoint serves as the expected probability of victory for the bin
  perf$Mid_Bin <- gop_bin_midpoints(as.character(perf$Probability_Bin))
  dplyr::mutate(perf, Over_Performance = Prop_of_Victory - Mid_Bin)
}

#' Build one diagnostic scatter plot (in percent) against the bin midpoint.
#'
#' @param shown Summary rows to plot.
#' @param y_col Name of the column plotted on the y axis (scaled by 100).
#' @param y_label Label of the y axis.
#' @param title Plot title.
#' @param reference_line A ggplot layer drawn as the dotted reference line.
#' @param variable Name shown in the colour legend; `NULL` for no grouping.
#' @return A ggplot object.
gop_build_plot <- function(shown, y_col, y_label, title, reference_line,
                           variable = NULL) {
  gg <- ggplot(shown, aes(x = Mid_Bin * 100, y = .data[[y_col]] * 100)) +
    geom_point() +
    geom_smooth(se = FALSE) +
    reference_line +
    ylab(y_label) +
    xlab("Expected Probability (%)") +
    ggtitle(title)

  if (!is.null(variable)) {
    gg <- gg +
      aes(group = Dummy, colour = Dummy) +
      labs(color = variable)
  }
  gg
}

#' Print the diagnostic plots for one side of the market.
#'
#' Without a grouping variable three plots are printed (over performance,
#' calibration, ROI); with one, the calibration plot is omitted.
#'
#' @param perf Output of `gop_summarise_side()`.
#' @param min_bin_size Bins with fewer observations are not plotted.
#' @param title Plot title ("Favorites" or "Underdogs").
#' @param variable Name of the grouping variable, or `NULL`.
#' @return `NULL`, invisibly; called for its plotting side effect.
gop_plot_side <- function(perf, min_bin_size, title, variable = NULL) {
  shown <- dplyr::filter(perf, Size_of_Bin >= min_bin_size)
  zero_line <- geom_hline(yintercept = 0, linetype = "dotted")

  print(gop_build_plot(
    shown, "Over_Performance", "Over Performance (%)", title, zero_line,
    variable
  ))

  if (is.null(variable)) {
    identity_line <- geom_abline(slope = 1, intercept = 0, linetype = "dotted")
    print(gop_build_plot(
      shown, "Prop_of_Victory", "Probability of Victory (%)", title,
      identity_line
    ))
  }

  # ROI plot - only real difference is the scale along the y axis
  print(gop_build_plot(shown, "ROI", "ROI (%)", title, zero_line, variable))

  invisible(NULL)
}

#' Visualise how favorites and underdogs perform relative to their odds.
#'
#' Bins fights by implied probability (separately for favorites and
#' underdogs), compares the observed win proportion and ROI of every bin with
#' the bin midpoint, prints diagnostic plots and returns the bin summaries.
#' These are descriptive visualisations, not inferential analyses.
#'
#' @param num_bin Number of equal-width probability bins.
#' @param min_bin_size Minimum number of fights a bin needs to be plotted
#'   (all bins are still returned).
#' @param variable Optional name (string) of a column of `data` used as a
#'   second grouping variable; it appears as column `Dummy` in the result.
#' @param data One-row-per-fight data frame with columns Favorite_Probability,
#'   Underdog_Probability, Favorite_was_Winner, Favorite_Unit_Profit and
#'   Underdog_Unit_Profit. Defaults to `df_odds_short` from the calling
#'   environment, which keeps existing calls working unchanged.
#' @return A tibble with the favorite rows followed by the underdog rows and
#'   columns Probability_Bin, (Dummy,) Prop_of_Victory, Size_of_Bin, ROI,
#'   Mid_Bin, Over_Performance and Is_Fav.
gauge_over_performance <- function(num_bin = 10, min_bin_size = 30,
                                   variable = NULL, data = df_odds_short) {
  required_cols <- c(
    "Favorite_Probability", "Underdog_Probability", "Favorite_was_Winner",
    "Favorite_Unit_Profit", "Underdog_Unit_Profit"
  )
  missing_cols <- setdiff(required_cols, colnames(data))
  if (length(missing_cols) > 0) {
    stop(
      "`data` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  dummy <- NULL
  if (!is.null(variable)) {
    if (!is.character(variable) || length(variable) != 1 ||
      !variable %in% colnames(data)) {
      stop(
        "`variable` must be the name of a single column of `data`; got: ",
        paste(variable, collapse = ", "),
        call. = FALSE
      )
    }
    # `[[` returns a plain vector for data frames and tibbles alike
    dummy <- data[[variable]]
  }

  fav_perf <- gop_summarise_side(
    probability = data[["Favorite_Probability"]],
    won = data[["Favorite_was_Winner"]],
    unit_profit = data[["Favorite_Unit_Profit"]],
    num_bin = num_bin,
    dummy = dummy
  )
  gop_plot_side(fav_perf, min_bin_size, "Favorites", variable)

  under_perf <- gop_summarise_side(
    probability = data[["Underdog_Probability"]],
    won = !data[["Favorite_was_Winner"]],
    unit_profit = data[["Underdog_Unit_Profit"]],
    num_bin = num_bin,
    dummy = dummy
  )
  gop_plot_side(under_perf, min_bin_size, "Underdogs", variable)

  fav_perf$Is_Fav <- TRUE
  under_perf$Is_Fav <- FALSE

  rbind(fav_perf, under_perf)
}

Look at how expected performance predicts over performance.

gauge_over_performance(num_bin = 10, min_bin_size = 100, variable = NULL)

Is there any stability across years? Need to reduce minimum bin size to get estimates. As a result, estimates will be more noisy.

gauge_over_performance(num_bin = 10, min_bin_size = 30, variable = "Year")

Does the method of victory affect the relationship between odds and outcome? Reduce number of bins (compared to Year comparison above) to stabilize estimates. Graphs do not tell whole story due to number of data points available across bins.

odds_perf_by_method = gauge_over_performance(num_bin = 5, min_bin_size = 30, variable = "Method")

print(odds_perf_by_method)
## # A tibble: 47 x 8
## # Groups:   Probability_Bin [10]
##    Probability_Bin Dummy Prop_of_Victory Size_of_Bin      ROI Mid_Bin
##    <fct>           <fct>           <dbl>       <int>    <dbl>   <dbl>
##  1 (0.399,0.509]   KO/T…           0.5            14  0.0171    0.454
##  2 (0.399,0.509]   S-DEC           0.5             2  0         0.454
##  3 (0.399,0.509]   SUB             0.714           7  0.42      0.454
##  4 (0.399,0.509]   U-DEC           0.565          23  0.137     0.454
##  5 (0.509,0.617]   KO/T…           0.530         349 -0.0624    0.563
##  6 (0.509,0.617]   M-DEC           0.636          11  0.123     0.563
##  7 (0.509,0.617]   S-DEC           0.420         162 -0.255     0.563
##  8 (0.509,0.617]   SUB             0.543         197 -0.0432    0.563
##  9 (0.509,0.617]   U-DEC           0.554         453 -0.0223    0.563
## 10 (0.617,0.726]   KO/T…           0.676         296  0.00368   0.672
## # … with 37 more rows, and 2 more variables: Over_Performance <dbl>,
## #   Is_Fav <lgl>

How does the fight-finishing method vary with the implied probability of the Vegas odds?

# Number of fights per finishing method in each implied-probability bin.
# Mid_Bin is stored as a proportion (0-1), so it is multiplied by 100 to
# match the "(%)" unit of the x-axis label.

# Favorites
odds_perf_by_method %>%
  dplyr::filter(Is_Fav) %>%
  ggplot(aes(x = Mid_Bin * 100, y = Size_of_Bin, group = Dummy, color = Dummy)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ylab("Count") +
  xlab("Implied Probability (%)") +
  ggtitle("Favorites") +
  labs(color = "Method")

# Underdogs
odds_perf_by_method %>%
  dplyr::filter(!Is_Fav) %>%
  ggplot(aes(x = Mid_Bin * 100, y = Size_of_Bin, group = Dummy, color = Dummy)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ylab("Count") +
  xlab("Implied Probability (%)") +
  ggtitle("Underdogs") +
  labs(color = "Method")

Calculate the proportion of fights that end by various methods as a function of implied probability of fight odds.

# Total number of fights in each (side, probability bin) cell.
total_count <- odds_perf_by_method %>%
  group_by(Is_Fav, Mid_Bin) %>%
  summarise(Total_Count = sum(Size_of_Bin))

# Number of fights per method within each cell (one row per group already).
single_count <- odds_perf_by_method %>%
  group_by(Is_Fav, Mid_Bin, Dummy) %>%
  summarise(Count = Size_of_Bin)

# Share of each method within its cell; the join key is spelled out rather
# than left to merge()'s "all common columns" default.
method_count_by_odds <- merge(
  single_count, total_count,
  by = c("Is_Fav", "Mid_Bin")
)
method_count_by_odds <- method_count_by_odds %>%
  dplyr::mutate(Method_Prop = Count / Total_Count)

# Mid_Bin is a proportion (0-1); scale it by 100 so the x axis really is in
# percent, as its label states.

# Favorites
method_count_by_odds %>%
  dplyr::filter(Is_Fav) %>%
  ggplot(aes(x = Mid_Bin * 100, y = Method_Prop, group = Dummy, color = Dummy)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ylab("Proportion") +
  xlab("Implied Probability (%)") +
  ggtitle("Favorites") +
  labs(color = "Method")

# Underdogs
method_count_by_odds %>%
  dplyr::filter(!Is_Fav) %>%
  ggplot(aes(x = Mid_Bin * 100, y = Method_Prop, group = Dummy, color = Dummy)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ylab("Proportion") +
  xlab("Implied Probability (%)") +
  ggtitle("Underdogs") +
  labs(color = "Method")


Fighter Odds

Get rid of useless columns.

# Keep only the columns needed for the per-fighter analysis
# (still one row per fighter per fight).
df_odds_long <- dplyr::select(
  df_odds,
  NAME, Event, Date, Result, Winner_Odds, Loser_Odds
)

Summarize data.

summary(df_odds_long)
##                  NAME                                       Event     
##  Donald Cerrone    :  24   UFC Fight Night: Poirier vs. Gaethje:  28  
##  Ovince Saint Preux:  21   UFC Fight Night: Whittaker vs. Till :  28  
##  Jim Miller        :  19   UFC 190: Rousey vs Correia          :  26  
##  Derrick Lewis     :  18   UFC 193: Rousey vs Holm             :  26  
##  Neil Magny        :  18   UFC 210: Cormier vs. Johnson 2      :  26  
##  Tim Means         :  18   UFC 224: Nunes vs. Pennington       :  26  
##  (Other)           :5726   (Other)                             :5684  
##       Date               Result      Winner_Odds       Loser_Odds    
##  Min.   :2013-04-27   Loser :2922   Min.   : 1.060   Min.   : 1.070  
##  1st Qu.:2015-08-08   Winner:2922   1st Qu.: 1.420   1st Qu.: 1.770  
##  Median :2017-04-22                 Median : 1.710   Median : 2.380  
##  Mean   :2017-06-01                 Mean   : 1.975   Mean   : 2.811  
##  3rd Qu.:2019-03-30                 3rd Qu.: 2.300   3rd Qu.: 3.350  
##  Max.   :2020-12-19                 Max.   :12.990   Max.   :14.050  
## 

Add Fighter Odds column.

# Odds of the fighter in this row: the winner's odds when the row belongs to
# the winner, the loser's odds otherwise.
df_odds_long$Fighter_Odds <- with(
  df_odds_long,
  ifelse(Result == "Winner", Winner_Odds, Loser_Odds)
)

Add Implied Probability column.

# Implied win probability (reciprocal of the decimal odds), a logical outcome
# flag, and the implied probability on the logit (log-odds) scale.
df_odds_long <- df_odds_long %>%
  dplyr::mutate(
    Implied_Probability = 1 / Fighter_Odds,
    Won = Result == "Winner",
    Logit_Prob = qlogis(Implied_Probability)
  )

Get performance and odds.

# One row per fighter: mean implied probability (raw and on the logit scale),
# observed win proportion, number of fights, and the gap between observed and
# expected performance on both scales.
df_odds_long_fighters <- df_odds_long %>%
  dplyr::group_by(NAME) %>%
  dplyr::summarise(
    Exp_Prop = mean(Implied_Probability),
    Logit_Exp_Prop = mean(Logit_Prob),
    Win_Prop = mean(Won),
    N_Fights = n(),
    Over_Performance = Win_Prop - Exp_Prop,
    Logit_Over = qlogis(Win_Prop) - Logit_Exp_Prop,
    Back_Trans_Exp = plogis(Logit_Exp_Prop)
  )

Top 10 over-performers with at least 5 fights where number of fights is simply number available in the dataset (see above).

# Fighters with at least 5 fights, ranked by raw over-performance
# (Win_Prop - Exp_Prop), largest first.
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(desc(Over_Performance))
# Same ranking on the logit scale. NOTE: Logit_Over is computed from
# qlogis(Win_Prop), which is +Inf when Win_Prop is exactly 1, so fighters who
# won every fight in the dataset all tie at the top of this ordering.
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(desc(Logit_Over))

Top 10 under performers with at least 5 fights.

# Fighters with at least 5 fights, ranked by raw over-performance
# (Win_Prop - Exp_Prop), smallest first.
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(Over_Performance)
# Same ranking on the logit scale. NOTE: Logit_Over is computed from
# qlogis(Win_Prop), which is -Inf when Win_Prop is exactly 0, so fighters who
# lost every fight in the dataset all tie at the top of this ordering.
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(Logit_Over)

Most heavily favored fighters with at least 5 fights.

# Fighters with at least 5 fights, ordered by mean implied probability
# (Exp_Prop), highest first.
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(desc(Exp_Prop))
# Same ordering using the mean of the per-fight logits (Logit_Exp_Prop).
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(desc(Logit_Exp_Prop))

Least favored fighters (lowest mean implied probability) with at least 5 fights.

# Fighters with at least 5 fights, ordered by mean implied probability
# (Exp_Prop), lowest first.
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(Exp_Prop)
# Same ordering using the mean of the per-fight logits (Logit_Exp_Prop).
df_odds_long_fighters %>%
  dplyr::filter(N_Fights >= 5) %>%
  dplyr::arrange(Logit_Exp_Prop)

Examine odds for specific fighters.

# Israel Adesanya
df_odds_long_fighters %>% dplyr::filter(NAME == "Israel Adesanya")
# Anthony Smith
df_odds_long_fighters %>% dplyr::filter(NAME == "Anthony Smith")